{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download & Preprocess the IMDB Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download reviews.txt and labels.txt from here: https://github.com/udacity/deep-learning/tree/master/sentiment-network\n",
    "\n",
    "def pretty_print_review_and_label(i):\n",
    "   print(labels[i] + \"\\t:\\t\" + reviews[i][:80] + \"...\")\n",
    "\n",
    "g = open('reviews.txt','r') # What we know!\n",
    "reviews = list(map(lambda x:x[:-1],g.readlines()))\n",
    "g.close()\n",
    "\n",
    "g = open('labels.txt','r') # What we WANT to know!\n",
    "labels = list(map(lambda x:x[:-1].upper(),g.readlines()))\n",
    "g.close()\n",
    "\n",
    "\n",
    "# Preprocess dataset:\n",
    "\n",
    "import sys\n",
    "\n",
    "f = open('reviews.txt')\n",
    "raw_reviews = f.readlines()\n",
    "f.close()\n",
    "\n",
    "f = open('labels.txt')\n",
    "raw_labels = f.readlines()\n",
    "f.close()\n",
    "\n",
    "tokens = list(map(lambda x:set(x.split(\" \")),raw_reviews))\n",
    "\n",
    "vocab = set()\n",
    "for sent in tokens:\n",
    "    for word in sent:\n",
    "        if(len(word)>0):\n",
    "            vocab.add(word)\n",
    "vocab = list(vocab)\n",
    "\n",
    "word2index = {}\n",
    "for i,word in enumerate(vocab):\n",
    "    word2index[word]=i\n",
    "\n",
    "input_dataset = list()\n",
    "for sent in tokens:\n",
    "    sent_indices = list()\n",
    "    for word in sent:\n",
    "        try:\n",
    "            sent_indices.append(word2index[word])\n",
    "        except:\n",
    "            \"\"\n",
    "    input_dataset.append(list(set(sent_indices)))\n",
    "\n",
    "target_dataset = list()\n",
    "for label in raw_labels:\n",
    "    if label == 'positive\\n':\n",
    "        target_dataset.append(1)\n",
    "    else:\n",
    "        target_dataset.append(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The Surprising Power of Averaged Word Vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['this tim burton remake of the original  ',\n",
       " 'certainly one of the dozen or so worst m',\n",
       " 'boring and appallingly acted  summer phe']"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "norms = np.sum(weights_0_1 * weights_0_1,axis=1)\n",
    "norms.resize(norms.shape[0],1)\n",
    "normed_weights = weights_0_1 * norms\n",
    "\n",
    "def make_sent_vect(words):\n",
    "    indices = list(map(lambda x:word2index[x],filter(lambda x:x in word2index,words)))\n",
    "    return np.mean(normed_weights[indices],axis=0)\n",
    "\n",
    "reviews2vectors = list()\n",
    "for review in tokens: # tokenized reviews\n",
    "    reviews2vectors.append(make_sent_vect(review))\n",
    "reviews2vectors = np.array(reviews2vectors)\n",
    "\n",
    "def most_similar_reviews(review):\n",
    "    v = make_sent_vect(review)\n",
    "    scores = Counter()\n",
    "    for i,val in enumerate(reviews2vectors.dot(v)):\n",
    "        scores[i] = val\n",
    "    most_similar = list()\n",
    "    \n",
    "    for idx,score in scores.most_common(3):\n",
    "        most_similar.append(raw_reviews[idx][0:40])\n",
    "    return most_similar\n",
    "\n",
    "most_similar_reviews(['boring','awful'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Matrices that Change Absolutely Nothing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1. 0. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 0. 1.]]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "a = np.array([1,2,3])\n",
    "b = np.array([0.1,0.2,0.3])\n",
    "c = np.array([-1,-0.5,0])\n",
    "d = np.array([0,0,0])\n",
    "\n",
    "identity = np.eye(3)\n",
    "print(identity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1. 2. 3.]\n",
      "[0.1 0.2 0.3]\n",
      "[-1.  -0.5  0. ]\n",
      "[0. 0. 0.]\n"
     ]
    }
   ],
   "source": [
    "print(a.dot(identity))\n",
    "print(b.dot(identity))\n",
    "print(c.dot(identity))\n",
    "print(d.dot(identity))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[13 15 17]\n",
      "[13. 15. 17.]\n"
     ]
    }
   ],
   "source": [
    "this = np.array([2,4,6])\n",
    "movie = np.array([10,10,10])\n",
    "rocks = np.array([1,1,1])\n",
    "\n",
    "print(this + movie + rocks)\n",
    "print((this.dot(identity) + movie).dot(identity) + rocks)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Forward Propagation in Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def softmax(x_):\n",
    "    x = np.atleast_2d(x_)\n",
    "    temp = np.exp(x)\n",
    "    return temp / np.sum(temp, axis=1, keepdims=True)\n",
    "\n",
    "word_vects = {}\n",
    "word_vects['yankees'] = np.array([[0.,0.,0.]])\n",
    "word_vects['bears'] = np.array([[0.,0.,0.]])\n",
    "word_vects['braves'] = np.array([[0.,0.,0.]])\n",
    "word_vects['red'] = np.array([[0.,0.,0.]])\n",
    "word_vects['socks'] = np.array([[0.,0.,0.]])\n",
    "word_vects['lose'] = np.array([[0.,0.,0.]])\n",
    "word_vects['defeat'] = np.array([[0.,0.,0.]])\n",
    "word_vects['beat'] = np.array([[0.,0.,0.]])\n",
    "word_vects['tie'] = np.array([[0.,0.,0.]])\n",
    "\n",
    "sent2output = np.random.rand(3,len(word_vects))\n",
    "\n",
    "identity = np.eye(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111\n",
      "  0.11111111 0.11111111 0.11111111]]\n"
     ]
    }
   ],
   "source": [
    "layer_0 = word_vects['red']\n",
    "layer_1 = layer_0.dot(identity) + word_vects['socks']\n",
    "layer_2 = layer_1.dot(identity) + word_vects['defeat']\n",
    "\n",
    "pred = softmax(layer_2.dot(sent2output))\n",
    "print(pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How do we Backpropagate into this?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = np.array([1,0,0,0,0,0,0,0,0]) # target one-hot vector for \"yankees\"\n",
    "\n",
    "pred_delta = pred - y\n",
    "layer_2_delta = pred_delta.dot(sent2output.T)\n",
    "defeat_delta = layer_2_delta * 1 # can ignore the \"1\" like prev. chapter\n",
    "layer_1_delta = layer_2_delta.dot(identity.T)\n",
    "socks_delta = layer_1_delta * 1 # again... can ignore the \"1\"\n",
    "layer_0_delta = layer_1_delta.dot(identity.T)\n",
    "alpha = 0.01\n",
    "word_vects['red'] -= layer_0_delta * alpha\n",
    "word_vects['socks'] -= socks_delta * alpha\n",
    "word_vects['defeat'] -= defeat_delta * alpha\n",
    "identity -= np.outer(layer_0,layer_1_delta) * alpha\n",
    "identity -= np.outer(layer_1,layer_2_delta) * alpha\n",
    "sent2output -= np.outer(layer_2,pred_delta) * alpha"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let's Train it!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\\tbathroom\\t1']]\n"
     ]
    }
   ],
   "source": [
    "import sys,random,math\n",
    "from collections import Counter\n",
    "import numpy as np\n",
    "\n",
    "f = open('tasksv11/en/qa1_single-supporting-fact_train.txt','r')\n",
    "raw = f.readlines()\n",
    "f.close()\n",
    "\n",
    "tokens = list()\n",
    "for line in raw[0:1000]:\n",
    "    tokens.append(line.lower().replace(\"\\n\",\"\").split(\" \")[1:])\n",
    "\n",
    "print(tokens[0:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = set()\n",
    "for sent in tokens:\n",
    "    for word in sent:\n",
    "        vocab.add(word)\n",
    "\n",
    "vocab = list(vocab)\n",
    "\n",
    "word2index = {}\n",
    "for i,word in enumerate(vocab):\n",
    "    word2index[word]=i\n",
    "    \n",
    "def words2indices(sentence):\n",
    "    idx = list()\n",
    "    for word in sentence:\n",
    "        idx.append(word2index[word])\n",
    "    return idx\n",
    "\n",
    "def softmax(x):\n",
    "    e_x = np.exp(x - np.max(x))\n",
    "    return e_x / e_x.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(1)\n",
    "embed_size = 10\n",
    "\n",
    "# word embeddings\n",
    "embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1\n",
    "\n",
    "# embedding -> embedding (initially the identity matrix)\n",
    "recurrent = np.eye(embed_size)\n",
    "\n",
    "# sentence embedding for empty sentence\n",
    "start = np.zeros(embed_size)\n",
    "\n",
    "# embedding -> output weights\n",
    "decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1\n",
    "\n",
    "# one hot lookups (for loss function)\n",
    "one_hot = np.eye(len(vocab))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Forward Propagation with Arbitrary Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(sent):\n",
    "    \n",
    "    layers = list()\n",
    "    layer = {}\n",
    "    layer['hidden'] = start\n",
    "    layers.append(layer)\n",
    "\n",
    "    loss = 0\n",
    "\n",
    "    # forward propagate\n",
    "    preds = list()\n",
    "    for target_i in range(len(sent)):\n",
    "\n",
    "        layer = {}\n",
    "\n",
    "        # try to predict the next term\n",
    "        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))\n",
    "\n",
    "        loss += -np.log(layer['pred'][sent[target_i]])\n",
    "\n",
    "        # generate the next hidden state\n",
    "        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]\n",
    "        layers.append(layer)\n",
    "\n",
    "    return layers, loss"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Backpropagation with Arbitrary Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forward\n",
    "for iter in range(30000):\n",
    "    alpha = 0.001\n",
    "    sent = words2indices(tokens[iter%len(tokens)][1:])\n",
    "    layers,loss = predict(sent) \n",
    "\n",
    "    # back propagate\n",
    "    for layer_idx in reversed(range(len(layers))):\n",
    "        layer = layers[layer_idx]\n",
    "        target = sent[layer_idx-1]\n",
    "\n",
    "        if(layer_idx > 0):  # if not the first layer\n",
    "            layer['output_delta'] = layer['pred'] - one_hot[target]\n",
    "            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())\n",
    "\n",
    "            # if the last layer - don't pull from a later one becasue it doesn't exist\n",
    "            if(layer_idx == len(layers)-1):\n",
    "                layer['hidden_delta'] = new_hidden_delta\n",
    "            else:\n",
    "                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())\n",
    "        else: # if the first layer\n",
    "            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Weight Update with Arbitrary Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Perplexity:82.09227500075585\n",
      "Perplexity:81.87615610433569\n",
      "Perplexity:81.53705034457951\n",
      "Perplexity:80.88879456876245\n",
      "Perplexity:79.50015694256045\n",
      "Perplexity:76.04440447063566\n",
      "Perplexity:63.76523100870378\n",
      "Perplexity:34.69262611144399\n",
      "Perplexity:21.77439314730968\n",
      "Perplexity:19.74440305631078\n",
      "Perplexity:18.813349002926333\n",
      "Perplexity:17.920571868736154\n",
      "Perplexity:16.84823833832929\n",
      "Perplexity:15.302868260393344\n",
      "Perplexity:12.898616378336536\n",
      "Perplexity:9.781678937443305\n",
      "Perplexity:7.546724222346714\n",
      "Perplexity:6.4277474041777305\n",
      "Perplexity:5.685698933881173\n",
      "Perplexity:5.240514920446924\n",
      "Perplexity:4.916476504398705\n",
      "Perplexity:4.674677629541541\n",
      "Perplexity:4.494159385603734\n",
      "Perplexity:4.365041755388302\n",
      "Perplexity:4.289971726173599\n",
      "Perplexity:4.243384558378477\n",
      "Perplexity:4.192001080475404\n",
      "Perplexity:4.132556753967558\n",
      "Perplexity:4.071667181580819\n",
      "Perplexity:4.0167814473718435\n"
     ]
    }
   ],
   "source": [
    "# forward\n",
    "for iter in range(30000):\n",
    "    alpha = 0.001\n",
    "    sent = words2indices(tokens[iter%len(tokens)][1:])\n",
    "\n",
    "    layers,loss = predict(sent) \n",
    "\n",
    "    # back propagate\n",
    "    for layer_idx in reversed(range(len(layers))):\n",
    "        layer = layers[layer_idx]\n",
    "        target = sent[layer_idx-1]\n",
    "\n",
    "        if(layer_idx > 0):\n",
    "            layer['output_delta'] = layer['pred'] - one_hot[target]\n",
    "            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())\n",
    "\n",
    "            # if the last layer - don't pull from a \n",
    "            # later one becasue it doesn't exist\n",
    "            if(layer_idx == len(layers)-1):\n",
    "                layer['hidden_delta'] = new_hidden_delta\n",
    "            else:\n",
    "                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())\n",
    "        else:\n",
    "            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())\n",
    "\n",
    "    # update weights\n",
    "    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))\n",
    "    for layer_idx,layer in enumerate(layers[1:]):\n",
    "        \n",
    "        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / float(len(sent))\n",
    "        \n",
    "        embed_idx = sent[layer_idx]\n",
    "        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sent))\n",
    "        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sent))\n",
    "        \n",
    "    if(iter % 1000 == 0):\n",
    "        print(\"Perplexity:\" + str(np.exp(loss/len(sent))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Execution and Output Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['sandra', 'moved', 'to', 'the', 'garden.']\n",
      "Prev Input:sandra      True:moved          Pred:is\n",
      "Prev Input:moved       True:to             Pred:to\n",
      "Prev Input:to          True:the            Pred:the\n",
      "Prev Input:the         True:garden.        Pred:bedroom.\n"
     ]
    }
   ],
   "source": [
    "sent_index = 4\n",
    "\n",
    "l,_ = predict(words2indices(tokens[sent_index]))\n",
    "\n",
    "print(tokens[sent_index])\n",
    "\n",
    "for i,each_layer in enumerate(l[1:-1]):\n",
    "    input = tokens[sent_index][i]\n",
    "    true = tokens[sent_index][i+1]\n",
    "    pred = vocab[each_layer['pred'].argmax()]\n",
    "    print(\"Prev Input:\" + input + (' ' * (12 - len(input))) +\\\n",
    "          \"True:\" + true + (\" \" * (15 - len(true))) + \"Pred:\" + pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
